# import all packages and set plots to be embedded inline
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sb
%matplotlib inline
# Read prosperLoanData.csv and preview five random rows
df_loan_data = pd.read_csv('prosperLoanData.csv')
df_loan_data.sample(5)
# Report the frame's shape (rows, columns)
print("The number of rows and columns: {}".format(df_loan_data.shape))
# Summarise the column dtypes
df_loan_data.info();
# Turn term, prosper score and both credit score bounds into ordered categoricals
for ordinal_column in ("Term", "ProsperScore", "CreditScoreRangeLower", "CreditScoreRangeUpper"):
    as_category = df_loan_data[ordinal_column].astype("category")
    df_loan_data[ordinal_column] = as_category.cat.as_ordered()
# The income labels are strings, so their ranking is spelled out explicitly
income_levels = ['$0', 'Not employed', 'Not displayed', '$1-24,999', '$25,000-49,999',
                 '$50,000-74,999', '$75,000-99,999', '$100,000+']
income_as_category = df_loan_data["IncomeRange"].astype("category")
df_loan_data["IncomeRange"] = income_as_category.cat.set_categories(income_levels, ordered=True)
# Show rows that are exact duplicates of an earlier row
df_loan_data[df_loan_data.duplicated()]
There are 113,937 loans in the dataset with 81 columns describing each loan. Most variables are numeric in nature (61).
I want to investigate the following features relating to the borrower and the loan and determine their influence on borrower APR.
Borrower: ProsperScore, CreditScoreRangeLower, CreditScoreRangeUpper, IncomeRange
Loan: Term, BorrowerRate, LoanOriginalAmount, Investors
Since Borrower APR is the borrower rate plus fees, I expect them to be closely related.
I expect that prosper score, the original loan amount and term will have a significant effect on borrower APR.
# This function displays each bar's value on top of the bar
def display_value_on_bar(ax):
    """Annotate every patch of ``ax`` with its height, truncated to an integer.

    Parameters
    ----------
    ax : object exposing ``patches`` and ``annotate``
        Every call site in this file passes a matplotlib Axes.

    The label is centred horizontally on the bar and placed 5 points above
    its top. Bars whose height cannot be converted with ``int()`` (NaN or
    infinity) are skipped instead of aborting the whole plot.
    """
    for rect in ax.patches:
        height = rect.get_height()
        try:
            label = f'{int(height)}'
        except (ValueError, OverflowError):
            # int(nan) raises ValueError and int(inf) raises OverflowError;
            # such a bar has no meaningful count to show.
            continue
        ax.annotate(label, xy=(rect.get_x() + rect.get_width() / 2, height),
                    xytext=(0, 5), textcoords='offset points',
                    ha='center', va='bottom', fontsize='x-small')
# This function creates a histogram of one column with fixed-width bins
def create_hist(s1, feature, bin_size, x_label, y_label, title, center=True, show_value=True):
    """Plot a histogram of ``s1[feature]`` on a new 16x6 inch figure.

    Parameters
    ----------
    s1 : table passed to ``ax.hist`` as ``data``; must support ``s1[feature].max()``
    feature : name of the column to plot
    bin_size : width of every bin
    x_label, y_label, title : axis labels and figure title
    center : when True, the bin edges are shifted left by half a bin so that
        multiples of ``bin_size`` sit in the middle of a bin
    show_value : when True, each bar is annotated via ``display_value_on_bar``

    Bin edges start at 0 (or ``-bin_size / 2`` when centred); smaller values
    are not drawn.
    """
    # Set figure size
    fig, ax = plt.subplots(figsize=(16, 6))
    # Create bins. The stop value is pushed one full bin (plus the centring
    # shift) past the maximum so that the last edge is at or beyond it;
    # a stop of max + bin_size/2 could leave the final edge below the maximum,
    # and the largest observations were then left out of the histogram.
    shift = bin_size / 2 if center else 0
    bin_edges = np.arange(0, s1[feature].max() + bin_size + shift, bin_size)
    # Adjust bins to center (no-op when shift is 0)
    bins = bin_edges - shift
    # Create histogram
    ax.hist(data=s1, x=feature, bins=bins, color='b', edgecolor='white')
    # Display value on top of bar
    if show_value:
        display_value_on_bar(ax)
    # Add title, x and y label
    plt.title(title)
    plt.xlabel(x_label)
    plt.ylabel(y_label)
# Narrow the frame to the borrower and loan features under investigation
columns_of_interest = [
    "ProsperScore", "CreditScoreRangeLower", "CreditScoreRangeUpper", "IncomeRange", "Term",
    "BorrowerRate", "BorrowerAPR", "LoanOriginalAmount", "LoanOriginationDate", "Investors",
]
df_loan_data = df_loan_data[columns_of_interest]
# Keep the trimmed frame as-is and do all further cleaning on a copy
loan_data = df_loan_data.copy()
I'll start by looking at the distribution of the main variable of interest: borrower APR.
# Count the rows that have no borrower APR
loan_data["BorrowerAPR"].isna().sum()
# Only 25 records lack a borrower APR, so they are dropped
loan_data.dropna(subset=["BorrowerAPR"], inplace=True)
# Summary statistics
loan_data["BorrowerAPR"].describe()
# Histogram configuration
x_feature = x_label = 'BorrowerAPR'
y_label = 'count'
bin_size = 0.01
title = 'Distribution of BorrowerAPR'
# Draw the histogram
create_hist(loan_data, feature=x_feature, bin_size=bin_size, x_label=x_label, y_label=y_label, title=title)
# Inspect the low end: entries with a BorrowerAPR below 0.015
outliers = loan_data[x_feature].lt(0.015)
print(outliers.sum())
print(loan_data[outliers])
The outliers appear to be valid points. It is interesting to note that some of these entries have a 0.00 BorrowerRate.
It is expected that entries which were created before 2009 won't have a ProsperScore.
# Inspect the high end: entries whose x_feature value exceeds 0.42
outliers = loan_data[x_feature].gt(0.42)
print(outliers.sum())
print(loan_data[outliers])
The outliers appear to be valid points. It makes sense that the entries without a prosper score, credit score and no income info would have the highest BorrowerAPR.
It is expected that entries which were created before 2009 won't have a ProsperScore.
# Kernel density estimate of BorrowerAPR on its own figure
plt.figure(figsize=(16, 6))
plt.title('Kernel Density Estimation plot of BorrowerAPR')
sb.distplot(loan_data.BorrowerAPR);
# Redraw the histogram with a narrower bin width and crop the tails
bin_size = 0.005
create_hist(loan_data, feature=x_feature, bin_size=bin_size, x_label=x_label, y_label=y_label, title=title)
plt.xlim(0.04, 0.42);
Limiting the axes and decreasing the bin size revealed that there is a large peak at 0.358.
The borrower APR distribution is slightly skewed to the right, with a large peak at 0.358.
Next I will take a look at the distribution of: Borrower rate.
# Count the rows that have no borrower rate
loan_data["BorrowerRate"].isna().sum()
# Summary statistics
loan_data["BorrowerRate"].describe()
# Histogram configuration
x_feature = x_label = 'BorrowerRate'
y_label = 'count'
bin_size = 0.01
title = 'Distribution of BorrowerRate'
# Draw the histogram
create_hist(loan_data, feature=x_feature, bin_size=bin_size, x_label=x_label, y_label=y_label, title=title)
# Inspect the low end: entries with a BorrowerRate below 0.008
outliers = loan_data[x_feature].lt(0.008)
print(outliers.sum())
print(loan_data[outliers])
The outliers appear to be valid points. It is interesting that 8 out of the 12 entries have a 0.00 BorrowerRate.
It is expected that entries which were created before 2009 won't have a ProsperScore.
# Inspect the high end: entries whose x_feature value exceeds 0.4
outliers = loan_data[x_feature].gt(0.4)
print(outliers.sum())
print(loan_data[outliers])
The outliers appear to be valid points. It makes sense that the entries without a prosper score, credit score and no income info would have the highest interest rate.
It is expected that entries which were created before 2009 won't have a ProsperScore.
# Kernel density estimate of BorrowerRate on its own figure
plt.figure(figsize=(16, 6))
plt.title('Kernel Density Estimation plot of BorrowerRate')
sb.distplot(loan_data.BorrowerRate);
# Redraw the histogram with a narrower bin width and crop the tails
bin_size = 0.005
create_hist(loan_data, feature=x_feature, bin_size=bin_size, x_label=x_label, y_label=y_label, title=title)
plt.xlim(0.04, 0.38);
Limiting the axes and decreasing the bin size revealed that there is a large peak at 0.32.
The borrower rate distribution is slightly skewed to the right, with a large peak at 0.32. As expected the borrower rate distribution has a similar shape to the borrower APR, just moved lower, as it excludes fees which are included in the APR.
Let's start with the first variable of interest: the original loan amount. Let's determine what is the most popular loan amount?
# Count the rows that have no original loan amount
print(loan_data.LoanOriginalAmount.isna().sum())
# Summary statistics
loan_data["LoanOriginalAmount"].describe()
# Histogram configuration
x_feature = x_label = 'LoanOriginalAmount'
y_label = 'count'
bin_size = 1000
title = 'Distribution of LoanOriginalAmount'
# Draw the histogram
create_hist(loan_data, feature=x_feature, bin_size=bin_size, x_label=x_label, y_label=y_label, title=title)
# Inspect the entries with a LoanOriginalAmount of exactly 35000
outliers = loan_data[x_feature].eq(35000)
print(outliers.sum())
print(loan_data[outliers])
The outliers appear to be valid points. It is interesting to note that all of these entries fall in the highest income range and in the higher credit score ranges. The BorrowerRate and BorrowerAPR are also quite low for these entries.
Borrowers with more income at their disposal take out larger loans with lower interest rates.
# Kernel density estimate of LoanOriginalAmount on its own figure
plt.figure(figsize=(14, 6))
sb.distplot(loan_data.LoanOriginalAmount);
# Redraw the histogram with a narrower bin width and crop the tails
bin_size = 200
create_hist(loan_data, feature=x_feature, bin_size=bin_size, x_label=x_label, y_label=y_label, title=title)
plt.xlim(600, 11000);
Limiting the axes and decreasing the bin size revealed that there are peaks at round numbers. The smallest loan value is \$1000.
The most popular loan amount is \$4000. The distribution is skewed to the right, with most loans below a value of \$12000. Round numbers are popular with peaks at 2000, 3000, 4000, 5000, 10000, 15000, 20000 and 25000.
Let's look at the second variable of interest: the loan term. Let's determine which term is the most popular?
# Count the rows that have no term
print(loan_data.Term.isna().sum())
# Bar chart of the number of loans per term, with the count written on each bar
plt.figure(figsize=(14, 6))
ax = sb.countplot(x='Term', data=loan_data, color='b');
plt.title('Distribution of Term')
display_value_on_bar(ax)
36 months is the most popular term, with 60 months the second most popular and 12 months the least popular.
Let's take a look at investors and determine how many investors a loan generally has?
# Count the rows that have no investor count
print(loan_data.Investors.isna().sum())
# Summary statistics
loan_data["Investors"].describe()
# Histogram configuration
x_feature = x_label = 'Investors'
y_label = 'count'
bin_size = 25
title = 'Distribution of Investors'
# Draw the histogram with bin edges left on multiples of the bin size
create_hist(loan_data, feature=x_feature, bin_size=bin_size, x_label=x_label, y_label=y_label, title=title,
            center=False)
# Inspect the entries with more than 1000 investors
outliers = loan_data[x_feature].gt(1000)
print(outliers.sum())
print(loan_data[outliers])
The outliers appear to be valid points. It is interesting to note that these entries fall in a very high credit score range and their borrower rate and APR are on the lower end.
With a large number of investors a higher amount can be borrowed with a lower interest rate.
# Create Kernel Density Estimation plot
plt.figure(figsize = (14, 6))
sb.distplot(loan_data['Investors']);
# View statistics in log
np.log10(loan_data[x_feature].describe())
# There's a long tail in the distribution, so let's plot it on a log scale.
# Bin edges are spaced 0.1 apart in log10 units, from 10**0 = 1 up to just past the maximum.
log_binsize = 0.1
bins = 10 ** np.arange(0, np.log10(loan_data[x_feature].max())+log_binsize, log_binsize)
plt.figure(figsize=[16, 6])
plt.hist(data = loan_data, x = x_feature, bins = bins)
plt.xscale('log')
# Ticks at 1, 10, 100 and 1000. The earlier 10e0..10e3 literals evaluate to
# 10..10,000, which left no tick at 1 (where the first bin starts) and put
# one at 10,000, beyond the maximum of 1189 reported in the analysis text.
plt.xticks([1e0, 1e1, 1e2, 1e3], [1, 10, 100, 1000])
plt.xlabel(x_label)
plt.title('Distribution of Investors on a log scale.')
plt.show()
When plotted on a log-scale, the investors distribution has a very large spike at 1, and a normal distribution around 90.
A loan generally only has a single investor. The distribution is right skewed with very few loans with more than 800 investors.
Next we move to the credit score range. Let's determine the most popular range.
# Count the rows that have no lower credit score bound
missing_lower_bound = loan_data.CreditScoreRangeLower.isna().sum()
print(missing_lower_bound)
The number of missing data points is the same for both the lower and upper range.
# Summary statistics for both ends of the credit score range
credit_bounds = ('CreditScoreRangeLower', 'CreditScoreRangeUpper')
print("Lower range")
print(loan_data['CreditScoreRangeLower'].describe())
print("\nUpper range")
print(loan_data['CreditScoreRangeUpper'].describe())
# Few values are missing, so fill them with the (hard-coded) mode of each bound
for bound_column, mode_value in zip(credit_bounds, (680.00, 699.00)):
    loan_data[bound_column] = loan_data[bound_column].fillna(mode_value)
# Stack the lower and upper bound count plots, as they should share one distribution
fig, ax = plt.subplots(nrows=2, figsize=(14, 10))
for panel, bound_column in zip(ax, credit_bounds):
    sb.countplot(data=loan_data, x=bound_column, color='b', ax=panel);
    panel.tick_params(axis='x', rotation=90)
ax[0].set_title('Distribution of CreditScoreRange')
for panel in ax:
    display_value_on_bar(panel)
# Inspect the entries whose lower bound is 0, the lowest credit score range
outliers = loan_data['CreditScoreRangeLower'] == 0.0
print(outliers.sum())
print(loan_data[outliers])
It is expected that entries which were created before 2009 won't have a ProsperScore.
It is interesting to note that these entries have their IncomeRange as "Not displayed", don't have a ProsperScore and fall in the lowest CreditScoreRange; this most likely means that these entries don't have a valid credit score.
# Remove the outlier points: keep only the rows where the `outliers` mask is False.
# `~` is the element-wise NOT for a boolean mask; unary minus is arithmetic
# negation and is not defined for NumPy boolean arrays.
loan_data = loan_data.loc[~outliers, :]
# Create Kernel Density Estimation plot
plt.figure(figsize = (14, 6))
sb.distplot(loan_data['CreditScoreRangeLower']);
The most popular credit score range is 680-699. The credit score range has a left skewed distribution.
Next up let's look at the prosper score and determine which score is the most popular?
# Check for missing data
loan_data.ProsperScore.isnull().sum()
# Look for entries created after July 2009 that lack a ProsperScore although they should have one.
# NOTE(review): the date test is a string comparison, which assumes LoanOriginationDate
# holds ISO-formatted (YYYY-MM-DD...) text - confirm against the CSV.
missing_score = (loan_data['ProsperScore'].isnull()) & (loan_data['LoanOriginationDate']> '2009-07-01')
print(missing_score.sum())
print(loan_data.loc[missing_score,:])
# There are only 131 records without a ProsperScore after 2009, so we can drop them.
# `~` is the element-wise NOT for a boolean mask; unary minus is arithmetic
# negation and is not defined for NumPy boolean arrays.
loan_data = loan_data.loc[~missing_score, :]
# View statistics
loan_data.ProsperScore.describe()
# Plot the distribution before any imputation of the missing scores
plt.figure(figsize = (14, 6))
ax = sb.countplot(data=loan_data, x='ProsperScore', color='b');
plt.title('Distribution of ProsperScore')
display_value_on_bar(ax)
The most popular prosper scores are 4, 6 and 8.
# Store a mode-imputed copy of ProsperScore (missing -> 4.0) in a separate column
imputed_score = loan_data.ProsperScore.fillna(4.0)
loan_data['ProsperScoreMod'] = imputed_score
# Count plot of the imputed column, with the count written on each bar
plt.figure(figsize=(14, 6))
ax = sb.countplot(x='ProsperScoreMod', data=loan_data, color='b');
display_value_on_bar(ax)
After the missing values were replaced with the mode 4 became the most popular prosper score by quite a large margin.
As quite a large number of ProsperScore values were missing due to the loans being created before 2009, replacing the missing values with the mode had a significant effect on the distribution.
For the rest of the investigation we will use the ProsperScore values without replacing the missing values with the mode.
# Kernel density estimate of the ProsperScore values that are present
plt.figure(figsize=(14, 6))
plt.title('Distribution of ProsperScore')
known_scores = loan_data.ProsperScore.dropna()
sb.distplot(known_scores);
Finally let's move to income range and determine the most popular income range.
# Count the rows that have no income range
print(loan_data.IncomeRange.isna().sum())
# Bar chart of the number of loans per income range, with the count written on each bar
plt.figure(figsize=(14, 6))
ax = sb.countplot(x='IncomeRange', data=loan_data, color='b');
plt.title('Distribution of IncomeRange')
display_value_on_bar(ax)
The most popular income range is \$25000-49999. People in the \$25000-49999 and \$50000-74999 income ranges borrow the most money.
25 entries were identified to not have a borrower APR and were removed as this is not a significant number. The borrower APR distribution is multimodal and slightly skewed to the right, with a peak around 0.17, 0.29 and a large peak around 0.358. There was no need to perform a transformation.
The number of investors took on a large range of values, from 1 to 1189, so a log transformation was performed. Under the transformation, a large spike at 1 was revealed and a normal distribution around 90.
133 outlier points were identified when investigating the credit score range. Overall, these points can be characterized by falling in a very low credit score range 0-19, by having their income range as "Not displayed" and not having a prosper score. All of these points were removed from the dataset for safety as these entries most likely don't have a valid credit score.
28926 entries were identified to not have a ProsperScore; of these, 131 were created after 2009 and should have a ProsperScore. The 131 were removed from the dataset as they are erroneous. The remaining values were kept as NaN as changing these missing values to the mode has quite a significant influence on the distribution.
To start off with, I want to look at the pairwise correlations present between features in the data.
# Feature groups used by the bivariate plots
numeric_vars = ['BorrowerAPR', 'BorrowerRate', 'LoanOriginalAmount', 'Investors']
categoric_vars1 = ['Term', 'ProsperScore']
categoric_vars2 = ['CreditScoreRangeLower', 'IncomeRange']
categoric_vars = categoric_vars1 + categoric_vars2 + ['CreditScoreRangeUpper']
# correlation plot
plt.figure(figsize = [14, 6])
sb.heatmap(loan_data[numeric_vars].corr(), annot = True, fmt = '.3f',
           cmap = 'vlag_r', center = 0)
plt.title("Correlation plot of BorrowerAPR, BorrowerRate, LoanOriginalAmount and Investors")
plt.show()
# plot matrix: sample up to 10000 loans so that plots are clearer and they render faster.
# The sample size is capped at the number of rows (sampling without replacement
# raises when asked for more rows than exist) and the seed is fixed so the
# sampled plots, and the findings written about them, can be reproduced.
print("loan_data.shape=",loan_data.shape)
loan_data_samp = loan_data.sample(n=min(10000, len(loan_data)), replace = False, random_state = 42)
print("loan_data_samp.shape=",loan_data_samp.shape)
g = sb.PairGrid(data = loan_data_samp, vars = numeric_vars)
g = g.map_diag(plt.hist, bins = 50);
g.map_offdiag(sb.regplot)
plt.suptitle('Plot Matrix of BorrowerAPR, BorrowerRate, LoanOriginalAmount and Investors')
plt.subplots_adjust(top=0.95)
It is confirmed that BorrowerAPR and BorrowerRate are highly correlated with one another, which is expected since BorrowerAPR is BorrowerRate plus fees.
BorrowerAPR and BorrowerRate are both negatively correlated with LoanOriginalAmount and Investors. LoanOriginalAmount and Investors are positively correlated with one another.
A larger amount being borrowed or a higher number of investors results in a lower interest rate and APR.
Let's move on to looking at how borrower APR correlates with the categorical variables.
# Violin-plot callback intended for seaborn's PairGrid
def violingrid(x, y, **kwargs):
    """Draw a blue violin plot of ``y`` against ``x`` with quartile lines.

    Tick labels are rotated by 90 degrees. Extra keyword arguments are
    accepted and ignored.
    """
    violin_axes = sb.violinplot(x=x, y=y, inner='quartile', color='b')
    violin_axes.tick_params(rotation=90)
# Box-plot callback intended for seaborn's PairGrid
def boxgrid(x, y, **kwargs):
    """Draw a blue box plot of ``y`` against ``x``.

    Tick labels are rotated by 90 degrees. Extra keyword arguments are
    accepted and ignored.
    """
    box_axes = sb.boxplot(x=x, y=y, color='b')
    box_axes.tick_params(rotation=90)
# Regression-plot callback intended for seaborn's PairGrid
def reggrid(x, y, **kwargs):
    """Draw a blue regression plot of ``y`` against ``x``.

    Tick labels are rotated by 90 degrees. Extra keyword arguments are
    accepted and ignored.
    """
    reg_axes = sb.regplot(x=x, y=y, color='b')
    reg_axes.tick_params(rotation=90)
# Plot matrix of numeric features (rows) against the first group of categorical features (columns).
# PairGrid builds its own figure, so the empty plt.figure() that used to precede it is dropped.
# `height` is the per-facet size in inches; it replaced the `size` keyword in seaborn 0.9.
g = sb.PairGrid(data = loan_data_samp, y_vars = numeric_vars, x_vars = categoric_vars1, height=6)
g.map(violingrid)
plt.suptitle('BorrowerAPR, BorrowerRate, LoanOriginalAmount and Investors by Term and ProsperScore')
plt.subplots_adjust(top=0.95)
plt.show();
Term seems to have no effect on BorrowerAPR, there is no correlation. There is a negative relationship between BorrowerAPR and ProsperScore. Term has a slightly positive relationship with BorrowerRate. There is a negative relationship between BorrowerRate and ProsperScore. There is a positive relationship between LoanOriginalAmount and both Term and ProsperScore. Term has a slightly negative relationship with Investors. There is a positive relationship between Investors and ProsperScore.
A higher prosper score results in a lower interest rate and APR, a higher number of investors and a larger amount being borrowed. A longer loan period results in a higher interest rate, a larger amount being borrowed and fewer investors. The loan period has no effect on APR.
# Plot matrix of numeric features (rows) against the second group of categorical features (columns).
# PairGrid builds its own figure, so the empty plt.figure() that used to precede it is dropped.
# `height` is the per-facet size in inches; it replaced the `size` keyword in seaborn 0.9.
g = sb.PairGrid(data = loan_data_samp, y_vars = numeric_vars, x_vars = categoric_vars2, height=6)
g.map(violingrid)
plt.suptitle('BorrowerAPR, BorrowerRate, LoanOriginalAmount and Investors by CreditScoreRangeLower and IncomeRange')
plt.subplots_adjust(top=0.95)
plt.show();
There is a negative relationship between BorrowerAPR and both CreditScoreRange and IncomeRange. There is a negative relationship between BorrowerRate and both CreditScoreRange and IncomeRange. There is a positive relationship between LoanOriginalAmount and both CreditScoreRange and IncomeRange. There is a positive relationship between Investors and both CreditScoreRange and IncomeRange.
A higher credit score range and a higher income range results in a lower interest rate, a lower APR, a larger amount being borrowed, and a higher number of investors.
Finally, let's look at relationships between the five categorical variables.
plt.figure(figsize=[14, 30])
# One stacked count plot per (x variable, hue variable, palette) triple
pairings = [
    ('ProsperScore', 'Term', 'Blues'),
    ('CreditScoreRangeLower', 'Term', 'Blues'),
    ('IncomeRange', 'Term', 'Blues'),
    ('CreditScoreRangeLower', 'ProsperScore', 'Greens'),
    ('IncomeRange', 'ProsperScore', 'Greens'),
    ('CreditScoreRangeLower', 'IncomeRange', 'Reds'),
]
for row, (x_var, hue_var, palette_name) in enumerate(pairings, start=1):
    # subplot `row` of 6: x_var counts split by hue_var
    plt.subplot(6, 1, row)
    panel = sb.countplot(data=loan_data_samp, x=x_var, hue=hue_var, palette=palette_name)
    panel.set_title(f'{x_var} vs {hue_var}')
    plt.legend(loc='upper left', title=hue_var)
plt.show()
The most popular term is 36 months across the entire prosper score range. All credit score ranges are biased towards a 36 month loan term. All income ranges are biased towards a 36 months loan term. There is a positive relationship between credit score range and prosper score. There is a positive relationship between income range and prosper score. There is a positive relationship between credit score range and income range.
36 months is dominant in the categorical variables. A higher credit score results in a higher income range. A higher credit score or a higher income range results in a higher prosper score.
BorrowerAPR is highly correlated with the BorrowerRate, which is expected. BorrowerAPR has a negative relationship with LoanOriginalAmount, Investors, CreditScoreRange, IncomeRange and ProsperScore. A larger loan amount, higher number of investors, a bigger income, a higher prosper score and a better credit score range results in a lower interest rate.
Interestingly there is no correlation between Term and BorrowerAPR.
LoanOriginalAmount has a positive relationship with Investors, Term, ProsperScore, CreditScoreRange and IncomeRange.
Investors has a positive relationship with LoanOriginalAmount, ProsperScore, CreditScoreRange and IncomeRange. Investors has a negative relationship with Term.
ProsperScore has a positive relationship with LoanOriginalAmount, CreditScoreRange, IncomeRange and Investors. There is an interesting relationship between ProsperScore and Term; its relationship doesn't seem to remain the same.
Term has complex relationships with all of the categorical variables; some of these relationships might be interesting to investigate in the multivariate section.
The main thing I want to explore in this part of the analysis is how Prosper score and term play into the relationship between borrower APR, loan original amount and investors respectively.
# Term effect on relationship between APR and investors: one regression panel per term.
# `height` is the per-facet size in inches; it replaced the `size` keyword in seaborn 0.9.
g=sb.FacetGrid(data=loan_data, col='Term', col_wrap=3, height=5, aspect=1.2)
g.map(sb.regplot, 'Investors', 'BorrowerAPR', x_jitter=0.04, scatter_kws={'alpha':0.1});
plt.suptitle('Term effect on the relationship between BorrowerAPR and Investors.')
plt.subplots_adjust(top=0.85)
g.add_legend();
As term increases the strong negative correlation between the number of investors and BorrowerAPR changes to a slightly negative correlation at a term of 60 months.
# Term effect on relationship between APR and loan amount: one regression panel per term.
# `height` is the per-facet size in inches; it replaced the `size` keyword in seaborn 0.9.
g=sb.FacetGrid(data=loan_data, col='Term', col_wrap=3, height=5, aspect=1.2)
g.map(sb.regplot, 'LoanOriginalAmount', 'BorrowerAPR', x_jitter=0.04, scatter_kws={'alpha':0.1});
plt.suptitle('Term effect on the relationship between BorrowerAPR and LoanOriginalAmount.')
plt.subplots_adjust(top=0.85)
g.add_legend();
As term increases the offset value of BorrowerAPR increases while the negative relationship between BorrowerAPR and LoanOriginalAmount remains the same.
# Prosper score effect on relationship between APR and investors: one regression panel per score.
# `height` is the per-facet size in inches; it replaced the `size` keyword in seaborn 0.9.
g=sb.FacetGrid(data=loan_data, col='ProsperScore', col_wrap=3, height=5, aspect=1.2)
g.map(sb.regplot, 'Investors', 'BorrowerAPR', x_jitter=0.04, scatter_kws={'alpha':0.1});
plt.suptitle('ProsperScore effect on the relationship between BorrowerAPR and Investors.')
plt.subplots_adjust(top=0.95)
g.add_legend();
As prosper score increases the offset value of BorrowerAPR decreases while the correlation between BorrowerAPR and investors swings from slightly positive to negative.
# Prosper score effect on relationship between APR and loan amount: one regression panel per score.
# `height` is the per-facet size in inches; it replaced the `size` keyword in seaborn 0.9.
g=sb.FacetGrid(data=loan_data, col='ProsperScore', col_wrap=3, height=5, aspect=1.2)
g.map(sb.regplot, 'LoanOriginalAmount', 'BorrowerAPR', x_jitter=0.04, scatter_kws={'alpha':0.1});
plt.suptitle('ProsperScore effect on the relationship between BorrowerAPR and LoanOriginalAmount.')
plt.subplots_adjust(top=0.95)
g.add_legend();
As prosper score increases the negative correlation between BorrowerAPR and LoanOriginalAmount decreases and travels through no correlation to finally end on positive correlation at a prosper score of 11.
# Point estimates of BorrowerAPR per prosper score, one dodged series per term
fig = plt.figure(figsize=[14, 6])
apr_point_options = dict(x='ProsperScore', y='BorrowerAPR', hue='Term',
                         palette='Blues', linestyles='', dodge=0.4)
ax = sb.pointplot(data=loan_data, **apr_point_options)
ax.tick_params(rotation=90)
plt.title('Borrower APR across prosper score and term')
plt.show()
As term increases the negative relationship between BorrowerAPR and ProsperScore decreases. For a prosper score between 1 and 3 a lower term results in a lower BorrowerAPR. For a prosper score between 3 and 6, a higher term results in lower BorrowerAPR. At a prosper score of 7 or higher, a higher term results in a higher BorrowerAPR.
# Point estimates of Investors per prosper score, one dodged series per term
fig = plt.figure(figsize=[14, 6])
investor_point_options = dict(x='ProsperScore', y='Investors', hue='Term',
                              palette='Blues', linestyles='', dodge=0.4)
ax = sb.pointplot(data=loan_data, **investor_point_options)
ax.tick_params(rotation=90)
plt.title('Investors across prosper score and term')
plt.show()
For a prosper score between 1 and 5 a lower term results in a higher number of investors. For a prosper score between 4 and 8, term has no effect. At a prosper score of 8, 9 and 11, a 36 month term results in a higher number of investors. At a prosper score of 10, a higher term results in a higher number of investors.
# Point estimates of LoanOriginalAmount per prosper score, one dodged series per term
fig = plt.figure(figsize=[14, 6])
amount_point_options = dict(x='ProsperScore', y='LoanOriginalAmount', hue='Term',
                            palette='Blues', linestyles='', dodge=0.4)
ax = sb.pointplot(data=loan_data, **amount_point_options)
ax.tick_params(rotation=90)
plt.title('Loan Original Amount across prosper score and term')
plt.show()
As ProsperScore and term increase, the LoanOriginalAmount also increases.
I extended my investigation of which variables affect BorrowerAPR in this section, by creating multivariable plots of Term, ProsperScore, Investors, LoanOriginalAmount and BorrowerAPR. The multivariate exploration revealed the following:
Term has a similar effect on the relationship between BorrowerAPR and Investors that it has on BorrowerAPR on its own. It moves towards slightly negative at a higher term.
The negative correlation between BorrowerAPR and LoanOriginalAmount remains intact, but there is an offset with an increase in term.
ProsperScore has a surprising effect on the relationship of BorrowerAPR with both LoanOriginalAmount and Investors. An increase in ProsperScore changes the correlation between BorrowerAPR and both LoanOriginalAmount and Investors. In both cases the correlation changes from either positive to negative or vice versa.
The relationship between Term, ProsperScore and BorrowerAPR changes at a ProsperScore of 3 and 6. For a prosper score between 1 and 3 a lower term results in a lower BorrowerAPR. For a prosper score between 3 and 6, a higher term results in lower BorrowerAPR. At a prosper score of 7 or higher, a higher term results in a higher BorrowerAPR.
There was a surprising interaction between Prosper score, term and number of investors. For a prosper score between 1 and 5 a lower term results in a higher number of investors. For a prosper score between 4 and 8, term has no effect. At a prosper score of 8, 9 and 11, a 36 month term results in a higher number of investors. At a prosper score of 10, a higher term results in a higher number of investors.
In order to understand BorrowerAPR and the relationship it has with ProsperScore, CreditScoreRangeLower, CreditScoreRangeUpper, IncomeRange, Term, BorrowerRate, LoanOriginalAmount and Investors, each variable's distribution was plotted to begin with.
From the BorrowerAPR distribution graph we saw that the borrower APR distribution spans the 0.006-0.52 range and is multimodal and slightly skewed to the right, with a peak around 0.17, 0.29 and a large peak around 0.358.
From the BorrowerRate distribution graph we saw that BorrowerRate spans from 0 to 0.5 and is slightly skewed to the right, with a large peak at 0.32. As expected the borrower rate distribution has a similar shape to the borrower APR, just moved lower, as it excludes fees which are included in the APR.
From the LoanOriginalAmount distribution graph we saw that the value of loans ranged from \$1000 to \$35000. The distribution is skewed to the right, with most loans below a value of \$12000. Round numbers are popular with peaks at 2000, 3000, 4000, 5000, 10000, 15000, 20000 and 25000. The most popular loan amount is \$4000.
From the Term distribution graph we saw that there are three terms on which money is borrowed, 12,36 or 60 months. 36 months is the most popular term.
From the Investors distribution graph we saw that the number of investors ranged from 1 to 1189 and is right skewed with very few loans with more than 800 investors. As there was a very large tail Investors was plotted on a log-scale, which revealed a very large spike at 1, and a normal distribution around 90.
From the CreditScoreRangeLower and Upper distribution we saw that the credit score ranges were from 0-900 and left skewed. 680-699 is the most popular credit score range.
From the ProsperScore distribution we saw that the ProsperScore could take on a value of 1 to 11 and that the most popular prosper scores are 4, 6 and 8.
From the IncomeRange distribution we saw that the IncomeRange could take on a value in the following list: '\$0', 'Not employed', 'Not displayed', '\$1-24,999', '\$25,000-49,999', '\$50,000-74,999', '\$75,000-99,999', '\$100,000+'. The most popular income range is \$25000-49999. People in the \$25000-49999 and \$50000-74999 income ranges borrow the most money.
To determine the pairwise relationship between features Bivariate exploration was performed.
From the plot matrix graph we saw that BorrowerAPR and BorrowerRate are highly correlated with one another, which is expected since BorrowerAPR is BorrowerRate plus fees. We also saw that as the loan amount increased the number of investors also increased. A larger amount being borrowed or a higher number of investors results in a lower interest rate and APR.
From the BorrowerAPR, BorrowerRate, LoanOriginalAmount and Investors by Term and ProsperScore graph we saw that a higher ProsperScore results in a lower interest rate and APR, a higher number of investors and a larger amount being borrowed. We also saw that a longer loan period results in a higher interest rate, a larger amount being borrowed and fewer investors. The loan period has no effect on APR.
From the BorrowerAPR, BorrowerRate, LoanOriginalAmount and Investors by CreditScoreRangeLower and IncomeRange graph we saw that a higher credit score range or a higher income range results in a lower interest rate, a lower APR, a larger amount being borrowed, and a higher number of investors.
From the ProsperScore, CreditScoreRange and IncomeRange vs Term graphs we saw that 36 months receives preference irrespective of ProsperScore, CreditScoreRange or IncomeRange.
From the CreditScoreRange vs IncomeRange graphs we saw that a higher credit score results in a higher income range.
From the CreditScoreRange, IncomeRange vs ProsperScore graphs we saw that a higher credit score or a higher income range results in a higher prosper score.
To extend our exploration of the relationships between variables, multivariate plots were created.
From the Term effect on the relationship between BorrowerAPR and Investors graph we saw that as term increases the strong negative correlation between the number of investors and BorrowerAPR changes to slightly negative correlation at a term of 60 months.
From the Term effect on the relationship between BorrowerAPR and LoanOriginalAmount graph we saw that as term increases the offset value of BorrowerAPR increases while the negative relationship between BorrowerAPR and LoanOriginalAmount remains the same.
From the ProsperScore effect on the relationship between BorrowerAPR and Investors graph we saw that as prosper score increases the offset value of BorrowerAPR decreases while the correlation between BorrowerAPR and investors swings from slightly positive to negative.
From the ProsperScore effect on the relationship between BorrowerAPR and LoanOriginalAmount graph we saw that as prosper score increases the negative correlation between BorrowerAPR and LoanOriginalAmount decreases and travels through no correlation to finally end on positive correlation at a prosper score of 11.
From the Borrower APR across prosper score and term graph we saw that for a prosper score between 1 and 3 a lower term results in a lower BorrowerAPR. For a prosper score between 3 and 6, a higher term results in lower BorrowerAPR. At a prosper score of 7 or higher, a higher term results in a higher BorrowerAPR.
From the Investors across prosper score and term graph we saw that for a prosper score between 1 and 5 a lower term results in a higher number of investors. For a prosper score between 4 and 8, term has no effect. At a prosper score of 8, 9 and 11, a 36 month term results in a higher number of investors. At a prosper score of 10, a higher term results in a higher number of investors.
From the Loan Original Amount across prosper score and term graph we saw that as ProsperScore and term increases the LoanOriginalAmount also increases.